1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(broom)
library(ggridges)

1.2 Data

1.2.1 Metabolite Abundances

# Cells #
vpa.cell.neg.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_negmode.csv")
vpa.cell.pos.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_posmode.csv")
# Media #
vpa.med.neg.raw <- read_csv("./data/abundances/vpa_1and2_media_target_negmode.csv")
vpa.med.pos.raw <- read_csv("./data/abundances/vpa_1and2_media_target_posmode.csv")

1.2.2 Compound Information

# Cells #
vpa.cell.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_negmode_cmpnd_info.csv")
vpa.cell.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_posmode_cmpnd_info.csv")
# Media #
vpa.med.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_negmode_cmpnd_info.csv")
vpa.med.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_posmode_cmpnd_info.csv")
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")
# run order and protein quant (cell samples)
# although run order is the same for media samples
vpa.cell.other.info <- read_csv("./data/other/vpa_exp1_other_info.csv")

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Group)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Group == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  smpls <- raw.dataframe %>%
    filter(Group == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  # replace the missing values in the real samples with the minimum of the samples
  # then take the log
  smpls.noNA <- raw.dataframe  %>%
    filter(Group == "sample") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      )
  # QC
  QC <- raw.dataframe %>%
    filter(Group == "QC") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Group == "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # replace the missing values in solv and empty samples with 2 - for PCA analysis
  # then take the log
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe %>%
    filter(Group != "sample" & Group != "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Group != "sample" & Group != "QC") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  # combine them together back into one data frame
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

# bind all 4 compound info df into 1 
full.vpa.cmpnd <- vpa.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(vpa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>% 
  bind_rows(vpa.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
  bind_rows(vpa.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.3) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA Only Exp") +
  ylim(0, 1100)

full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.5) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA Only Exp") +
  facet_wrap(~ Set) +
  ylim(0, 1100)

The compounds included in the Cells / Neg mode and Cells / Pos mode datasets loook to have a good spread of masses and retention times within the expected RT window of 0 to 29min and mass window of 50 to 1000Da. A smaller number of compounds were detected in the media samples, and the larger molecules that were detected in cells were not detected in media. This is not a surprising result, media in general is expected to have a lower number of compounds than the cells themselves, as cells produce a wide variety of molecules that are not exported to the media.

Q: Which compounds were found in one or more of the data types?

### vpa cell join ###
vpa.cell.cmpnd.join <- vpa.cell.neg.compound.info %>% 
  inner_join(vpa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(vpa.cell.cmpnd.join$compound_full.c.n)
  [1] "Glycine"                                    
  [2] "Acetoin"                                    
  [3] "Alanine"                                    
  [4] "Sarcosine"                                  
  [5] "Beta-Alanine"                               
  [6] "2-Aminobutyric Acid"                        
  [7] "BAIBA"                                      
  [8] "Serine"                                     
  [9] "Hypotaurine"                                
 [10] "Uracil"                                     
 [11] "Creatinine"                                 
 [12] "5,6-Dihydrouracil"                          
 [13] "Proline"                                    
 [14] "Valine"                                     
 [15] "Threonine"                                  
 [16] "Taurine"                                    
 [17] "4-Oxoproline"                               
 [18] "Ketoleucine"                                
 [19] "trans-4-Hydroxyproline"                     
 [20] "Creatine"                                   
 [21] "Isoleucine"                                 
 [22] "Leucine"                                    
 [23] "Asparagine"                                 
 [24] "5-Hydroxyhexanoic Acid"                     
 [25] "Aspartic Acid"                              
 [26] "Adenine"                                    
 [27] "Hypoxanthine"                               
 [28] "O-Phosphoethanolamine"                      
 [29] "Glutamine"                                  
 [30] "Lysine"                                     
 [31] "N-Acetylserine"                             
 [32] "Glutamic Acid"                              
 [33] "Methionine"                                 
 [34] "D-Ribose"                                   
 [35] "Guanine"                                    
 [36] "Xanthine"                                   
 [37] "3-Sulfinoalanine"                           
 [38] "Histidine"                                  
 [39] "Orotic Acid"                                
 [40] "Allantoin"                                  
 [41] "Fucose"                                     
 [42] "Phenylalanine"                              
 [43] "Pyridoxal"                                  
 [44] "Cysteic Acid"                               
 [45] "Pyridoxine"                                 
 [46] "3-Methylhistidine"                          
 [47] "G3P"                                        
 [48] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
 [49] "N-Carbamoyl-L-aspartic Acid"                
 [50] "Tyrosine"                                   
 [51] "D-Galactitol"                               
 [52] "Phosphocholine"                             
 [53] "N-alpha-Acetylglutamine"                    
 [54] "Tryptophan"                                 
 [55] "Phosphocreatine"                            
 [56] "Glycerylphosphorylethanolamine"             
 [57] "O-Succinylhomoserine"                       
 [58] "Pantothenic Acid"                           
 [59] "GlcNAc"                                     
 [60] "Cystathionine"                              
 [61] "Deoxycytidine"                              
 [62] "Cytidine"                                   
 [63] "Uridine"                                    
 [64] "Palmitoleic Acid"                           
 [65] "Glycerol 1-phosphoserine"                   
 [66] "D-Glucose 6-phosphate"                      
 [67] "Thiamine"                                   
 [68] "Inosine"                                    
 [69] "Myoinositol-methyl-phosphate"               
 [70] "Linoleic Acid"                              
 [71] "Guanosine"                                  
 [72] "Xanthosine"                                 
 [73] "L-Arginosuccinic Acid"                      
 [74] "Ricinoleic Acid"                            
 [75] "Glutathione"                                
 [76] "11Z,14Z-Eicosadienoic Acid (20:2 n-6)"      
 [77] "CMP"                                        
 [78] "UMP"                                        
 [79] "3-Phosphoglyceroinositol"                   
 [80] "AMP"                                        
 [81] "GMP"                                        
 [82] "SAH"                                        
 [83] "CDP"                                        
 [84] "ADP"                                        
 [85] "GDP"                                        
 [86] "LysoPE(18:2)"                               
 [87] "LysoPE(18:0)"                               
 [88] "dTTP"                                       
 [89] "CTP"                                        
 [90] "UTP"                                        
 [91] "CDP-Choline"                                
 [92] "dATP"                                       
 [93] "ATP"                                        
 [94] "GTP"                                        
 [95] "ADP-Ribose"                                 
 [96] "UDP-Galactose"                              
 [97] "UDP-Glucose"                                
 [98] "UDP-Glucuronic acid"                        
 [99] "ADP-Glucose"                                
[100] "GDP-Glucose"                                
[101] "UDP-GalNAc"                                 
[102] "UDP-GlcNAc"                                 
[103] "GSSG"                                       
[104] "NAD"                                        
[105] "NADH"                                       
[106] "NADP"                                       
[107] "PC(36:4)"                                   
[108] "FAD"                                        
[109] "Acetyl-CoA"                                 
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.neg.compound.info), 1)
[1] 51.7
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.pos.compound.info), 1)
[1] 61.9
### vpa media join ###
vpa.med.cmpnd.join <- vpa.med.neg.compound.info %>% 
  inner_join(vpa.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / media
print(vpa.med.cmpnd.join$compound_full.m.n)
 [1] "Alanine"                  "Serine"                  
 [3] "Uracil"                   "Valine"                  
 [5] "Threonine"                "Taurine"                 
 [7] "Thymine"                  "4-Oxoproline"            
 [9] "Isoleucine"               "Leucine"                 
[11] "Glutamine"                "Methionine"              
[13] "Xanthine"                 "Histidine"               
[15] "Allantoin"                "Phenylalanine"           
[17] "Uric Acid"                "D-Glucose"               
[19] "Tyrosine"                 "D-Galactitol"            
[21] "Cysteine-S-sulfonic Acid" "Tryptophan"              
[23] "Pantothenic Acid"         "Cytidine"                
[25] "Uridine"                  "Inosine"                 
[27] "Guanosine"                "Folic Acid"              
# percent of media / neg compounds found in media / pos
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.neg.compound.info), 1)
[1] 51.9
# percent of media / pos compounds found in media / neg
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.pos.compound.info), 1)
[1] 50
### vpa all modes join ###
vpa.all.cmpnd.join <- vpa.cell.cmpnd.join %>% 
  inner_join(vpa.med.cmpnd.join, by = "cas_id") %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
nrow(vpa.all.cmpnd.join)
[1] 23
# check for any mass issues between all 4 modes 
vpa.all.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

# any rt inconsistencies?
vpa.all.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?

MissingPerSamplePlot(vpa.cell.neg.raw, "VPAnC") +
  ggtitle("Missing Per Sample\nVPA-only / Cells / Neg Mode")

MissingPerSamplePlot(vpa.cell.pos.raw, "VPApC") +
  ggtitle("Missing Per Sample\nVPA-only / Cells / Pos Mode")

MissingPerSamplePlot(vpa.med.neg.raw, "VPAnM") +
  ggtitle("Missing Per Sample\nVPA-only / Media / Neg Mode")

MissingPerSamplePlot(vpa.med.pos.raw, "VPApM") +
  ggtitle("Missing Per Sample\nVPA-only / Media / Pos Mode")

A: No, sample files across both datasets have very few missing values. The green-colored bars, marked “sample” are the actual experimental samples. Whereas the “solv”, or “solvent”, and “empty” samples are negative control samples that are expected to have many missing values and/or low compound abundances. They will be used to narrow down the list of features later on in the analysis.

Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.

Note: The MissingPerCompound function considers only the Group == "sample" samples.

MissingPerCompound(vpa.cell.neg.raw, "VPAnC") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
(vpa.cell.pos.cmpnd.to.excl<- MissingPerCompound(vpa.cell.pos.raw, "VPApC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 VPApC110        9        23       39.1
MissingPerCompound(vpa.med.neg.raw, "VPAnM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
(vpa.med.pos.cmpnd.to.excl <- MissingPerCompound(vpa.med.pos.raw, "VPApM") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 VPApM34         6        24         25

A: No compounds need to be excluded from the Cell / Neg Mode and Media / Neg Mode datasets, but 1 compound each will be excluded from the Cell / Pos Mode and Media / Pos Mode datasets.

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

# get the mean abundance of each compound, grouped by solvent vs empty sample vs experimental sample
vpa.cell.neg.raw.grp.mean <- vpa.cell.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
# plot the log2 density distribution of the means
vpa.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Cells / Negative Mode\nGrouped by sample type")

# for plotting purposes, to make the plot neater
vpa.cell.neg.raw.grp.mean.order <- vpa.cell.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vpa.cell.neg.raw %>% 
  select(Samples, Group, starts_with("VPAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Negative Mode")

# compound mean by group only, ordered by increasing abundance in the experimental samples
vpa.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.raw.grp.diff <- vpa.cell.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-3, 13.5) +
  ylim(-3, 13.5) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.neg.cmpnd.to.incl <- vpa.cell.neg.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff))
# how many compound were there in the raw negative mode dataset?
nrow(vpa.cell.neg.raw.grp.diff)
[1] 211
# how many compounds are retained for further analysis?
nrow(vpa.cell.neg.cmpnd.to.incl)
[1] 179

2.3.2 Cells / Positive Mode

vpa.cell.pos.raw.grp.mean <- vpa.cell.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Cells / Positive Mode\nGrouped by sample type")

vpa.cell.pos.raw.grp.mean.order <- vpa.cell.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vpa.cell.pos.raw %>% 
  select(Samples, Group, starts_with("VPApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")  +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.raw.grp.diff <- vpa.cell.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-1, 15) +
  ylim(-1, 15) +
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.pos.cmpnd.to.incl <- vpa.cell.pos.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff)) %>% 
  filter(!(Compound %in% vpa.cell.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.cell.pos.raw.grp.diff)
[1] 176
# how many compounds are retained for further analysis?
nrow(vpa.cell.pos.cmpnd.to.incl)
[1] 142

2.3.3 Media / Negative Mode

vpa.med.neg.raw.grp.mean <- vpa.med.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPAnM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Media / Negative Mode\nGrouped by sample type")

vpa.med.neg.raw.grp.mean.order <- vpa.med.neg.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vpa.med.neg.raw %>% 
  select(Samples, Group, starts_with("VPAnM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1.5) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.med.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Negative Mode")

vpa.med.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")

vpa.med.neg.raw.grp.diff <- vpa.med.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vpa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-4, 4) +
  ylim(-1, 4) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.neg.cmpnd.to.incl <- vpa.med.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# how many compound were there in the raw dataset?
nrow(vpa.med.neg.raw.grp.diff)
[1] 54
# how many compounds are retained for further analysis?
nrow(vpa.med.neg.cmpnd.to.incl)
[1] 41

2.3.4 Media / Positive Mode

vpa.med.pos.raw.grp.mean <- vpa.med.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPApM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only / Media / Positive Mode\nGrouped by sample type")

vpa.med.pos.raw.grp.mean.order <- vpa.med.pos.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vpa.med.pos.raw %>% 
  select(Samples, Group, starts_with("VPApM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1.5) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  # overlay the group averages 
  geom_line(
    data = vpa.med.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Positive Mode")

vpa.med.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)")

vpa.med.pos.raw.grp.diff <- vpa.med.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vpa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-6, 2) +
  ylim(-1, 7) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.pos.cmpnd.to.incl <- vpa.med.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vpa.med.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.med.pos.raw.grp.diff)
[1] 56
# how many compounds are retained for further analysis?
nrow(vpa.med.pos.cmpnd.to.incl)
[1] 36

3 Data Prep and Preliminary Analysis

3.1 Cleanup

# replace missing with minimum for each Group in each dataset and log2() transform the data:
# exclude compound that have a >20% NA count across samples
vpa.cell.neg.noNA <- vpa.cell.neg.raw %>% 
  select(Samples:Experiment, one_of(vpa.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPAnC")
vpa.cell.pos.noNA <- vpa.cell.pos.raw %>% 
  select(Samples:Experiment, one_of(vpa.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPApC")
vpa.med.neg.noNA <- vpa.med.neg.raw %>% 
  select(Samples:Experiment, one_of(vpa.med.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("VPAnM")
vpa.med.pos.noNA <- vpa.med.pos.raw %>% 
  select(Samples:Experiment, one_of(vpa.med.pos.cmpnd.to.incl$Compound)) %>%  
  ReplaceNAwMinLogTransformSingle("VPApM")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

vpa.cell.neg.noNA.gathered <- vpa.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.neg.noNA) == "VPAnC1"):ncol(vpa.cell.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")

# same data format, but as ridge plots
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")

# experimental samples only
vpa.cell.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")

# overlay the distributions for another look
vpa.cell.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")

# relationship with protein quant and run order?
vpa.cell.neg.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(run_order, Abundance)) +
  geom_jitter(width = 0.1, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(method = "lm", color = "black", alpha = 0.8, se = FALSE)

vpa.cell.neg.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(prot_conc_ug_uL, Abundance)) +
  geom_jitter(width = 0.005, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(color = "black", alpha = 0.8, se = FALSE)

3.2.2 Cells / Positive Mode

vpa.cell.pos.noNA.gathered <- vpa.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.pos.noNA) == "VPApC1"):ncol(vpa.cell.pos.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")

# same data format, but as ridge plots
vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")

# experimental samples only
vpa.cell.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")

# overlay the distributions for another look
vpa.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")

# relationship with protein quant and run order?
vpa.cell.pos.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(run_order, Abundance)) +
  geom_jitter(width = 0.1, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(method = "lm", color = "black", alpha = 0.8, se = FALSE)

vpa.cell.pos.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(prot_conc_ug_uL, Abundance)) +
  geom_jitter(width = 0.005, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(color = "black", alpha = 0.8, se = FALSE)

3.2.3 Media / Negative Mode

vpa.med.neg.noNA.gathered <- vpa.med.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.med.neg.noNA) == "VPAnM1"):ncol(vpa.med.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")

# same data format, but as ridge plots
vpa.med.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")

# experimental samples only
vpa.med.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")

# overlay the distributions for another look
vpa.med.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")

# relationship with protein quant and run order?
vpa.med.neg.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(run_order, Abundance)) +
  geom_jitter(width = 0.1, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(method = "lm", color = "black", alpha = 0.8, se = FALSE)

vpa.med.neg.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(prot_conc_ug_uL, Abundance)) +
  geom_jitter(width = 0.005, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(color = "black", alpha = 0.8, se = FALSE)

3.2.4 Media / Positive Mode

vpa.med.pos.noNA.gathered <- vpa.med.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.med.pos.noNA) == "VPApM1"):ncol(vpa.med.pos.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")

# same data format, but as ridge plots
vpa.med.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")

# experimental samples only
vpa.med.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")

# overlay the distributions for another look
vpa.med.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")

vpa.med.pos.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(run_order, Abundance)) +
  geom_jitter(width = 0.1, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(method = "lm", color = "black", alpha = 0.8, se = FALSE)

vpa.med.pos.noNA.gathered %>% 
  inner_join(vpa.cell.other.info, by = "Samples") %>% 
  ggplot(aes(prot_conc_ug_uL, Abundance)) +
  geom_jitter(width = 0.005, size = 1, alpha = 0.6, aes(color = Treatment)) +
  geom_smooth(color = "black", alpha = 0.8, se = FALSE)

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
vpa.cell.neg.full.pca <- vpa.cell.neg.noNA %>% 
  select(starts_with("VPAnC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.full.pca.x <- as.data.frame(vpa.cell.neg.full.pca$x)
row.names(vpa.cell.neg.full.pca.x) <- vpa.cell.neg.noNA$Samples
vpa.cell.neg.full.pca.x <- vpa.cell.neg.full.pca.x %>% 
  bind_cols(vpa.cell.neg.noNA %>% select(Group:Experiment))
vpa.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (90.5% Var)") +
  ylab("PC2 (3.1%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Negative Mode")

### Samples and QC ###
vpa.cell.neg.smpl.QC.pca <- vpa.cell.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.QC.pca.x <- as.data.frame(vpa.cell.neg.smpl.QC.pca$x)
vpa.cell.neg.smpl.QC.pca.x <- vpa.cell.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.neg.smpl.QC.pca.x) <- vpa.cell.neg.smpl.QC.pca.x$Samples
vpa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (35.1% Var)") +
  ylab("PC2 (19.7%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (16.1% Var)") +
  ylab("PC4 (6.4%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")

### Experimental Samples Only ###
vpa.cell.neg.smpl.pca <- vpa.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.pca.x <- as.data.frame(vpa.cell.neg.smpl.pca$x)
vpa.cell.neg.smpl.pca.x <- vpa.cell.neg.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.neg.smpl.pca.x) <- vpa.cell.neg.smpl.pca.x$Samples
vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (34.3% Var)") +
  ylab("PC2 (23.8%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")

vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (16.6% Var)") +
  ylab("PC4 (7.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")

3.3.2 Cells / Positive Mode

### PCA on all Samples ###
vpa.cell.pos.full.pca <- vpa.cell.pos.noNA %>% 
  select(starts_with("VPApC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.full.pca.x <- as.data.frame(vpa.cell.pos.full.pca$x)
row.names(vpa.cell.pos.full.pca.x) <- vpa.cell.pos.noNA$Samples
vpa.cell.pos.full.pca.x <- vpa.cell.pos.full.pca.x %>% 
  bind_cols(vpa.cell.pos.noNA %>% select(Group:Experiment))
vpa.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (90.0% Var)") +
  ylab("PC2 (4.2%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Positive Mode")

### Samples and QC ###
vpa.cell.pos.smpl.QC.pca <- vpa.cell.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.QC.pca.x <- as.data.frame(vpa.cell.pos.smpl.QC.pca$x)
vpa.cell.pos.smpl.QC.pca.x <- vpa.cell.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.pos.smpl.QC.pca.x) <- vpa.cell.pos.smpl.QC.pca.x$Samples
vpa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (55.0% Var)") +
  ylab("PC2 (15.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.4% Var)") +
  ylab("PC4 (5.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")

### Experimental Samples Only ###
vpa.cell.pos.smpl.pca <- vpa.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.pca.x <- as.data.frame(vpa.cell.pos.smpl.pca$x)
vpa.cell.pos.smpl.pca.x <- vpa.cell.pos.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.cell.pos.smpl.pca.x) <- vpa.cell.pos.smpl.pca.x$Samples
vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (57.5% Var)") +
  ylab("PC2 (19.6%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")

vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.2% Var)") +
  ylab("PC4 (4.7%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")

3.3.3 Media / Negative Mode

### PCA on all Samples ###
vpa.med.neg.full.pca <- vpa.med.neg.noNA %>% 
  select(starts_with("VPAnM")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.med.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.full.pca.x <- as.data.frame(vpa.med.neg.full.pca$x)
row.names(vpa.med.neg.full.pca.x) <- vpa.med.neg.noNA$Samples
vpa.med.neg.full.pca.x <- vpa.med.neg.full.pca.x %>% 
  bind_cols(vpa.med.neg.noNA %>% select(Group:Experiment))
vpa.med.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (75.8% Var)") +
  ylab("PC2 (8.4%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Negative Mode")

### Samples and QC ###
vpa.med.neg.smpl.QC.pca <- vpa.med.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.smpl.QC.pca.x <- as.data.frame(vpa.med.neg.smpl.QC.pca$x)
vpa.med.neg.smpl.QC.pca.x <- vpa.med.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.med.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.neg.smpl.QC.pca.x) <- vpa.med.neg.smpl.QC.pca.x$Samples
vpa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (56.8% Var)") +
  ylab("PC2 (25.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")

vpa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (9.0% Var)") +
  ylab("PC4 (4.1%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")

### Experimental Samples Only ###
vpa.med.neg.smpl.pca <- vpa.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Negative Mode",
  type = "b"
  )

vpa.med.neg.smpl.pca.x <- as.data.frame(vpa.med.neg.smpl.pca$x)
vpa.med.neg.smpl.pca.x <- vpa.med.neg.smpl.pca.x %>% 
  bind_cols(
    vpa.med.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.neg.smpl.pca.x) <- vpa.med.neg.smpl.pca.x$Samples
vpa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (58.2% Var)") +
  ylab("PC2 (27.5%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")

vpa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.3% Var)") +
  ylab("PC4 (2.4%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")

3.3.4 Media / Positive Mode

### PCA on all Samples ###
vpa.med.pos.full.pca <- vpa.med.pos.noNA %>% 
  select(starts_with("VPApM")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.med.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.full.pca.x <- as.data.frame(vpa.med.pos.full.pca$x)
row.names(vpa.med.pos.full.pca.x) <- vpa.med.pos.noNA$Samples
vpa.med.pos.full.pca.x <- vpa.med.pos.full.pca.x %>% 
  bind_cols(vpa.med.pos.noNA %>% select(Group:Experiment))
vpa.med.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (85.6% Var)") +
  ylab("PC2 (6.3%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Positive Mode")

### Samples and QC ###
vpa.med.pos.smpl.QC.pca <- vpa.med.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.smpl.QC.pca.x <- as.data.frame(vpa.med.pos.smpl.QC.pca$x)
vpa.med.pos.smpl.QC.pca.x <- vpa.med.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vpa.med.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.pos.smpl.QC.pca.x) <- vpa.med.pos.smpl.QC.pca.x$Samples
vpa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (72.1% Var)") +
  ylab("PC2 (15.6%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")

vpa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (6.0% Var)") +
  ylab("PC4 (2.5%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")

### Experimental Samples Only ###
vpa.med.pos.smpl.pca <- vpa.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Positive Mode",
  type = "b"
  )

vpa.med.pos.smpl.pca.x <- as.data.frame(vpa.med.pos.smpl.pca$x)
vpa.med.pos.smpl.pca.x <- vpa.med.pos.smpl.pca.x %>% 
  bind_cols(
    vpa.med.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(vpa.med.pos.smpl.pca.x) <- vpa.med.pos.smpl.pca.x$Samples
vpa.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (80.9% Var)") +
  ylab("PC2 (11.2%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Positive Mode")

4 Batch Effects and Signifiance Testing

4.1 VPA metabolism

Is VPA metabolised by cells?

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  unite(Sample_Type, Group:Treatment, sep = "_") %>% 
  ggplot(aes(Sample_Type, VPAnM24)) +
  geom_jitter(alpha = 0.8, width = 0.1)

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  unite(Sample_Type, Group:Treatment, sep = "_") %>% 
  ggplot(aes(Sample_Type, VPAnM24)) +
  geom_boxplot()

vpa.med.neg.noNA %>% 
  select(Samples:Experiment, VPAnM24) %>% 
  filter(Treatment == "vpa") %>% 
  group_by(Group) %>% 
  summarize(
    vpa.avg = mean(VPAnM24),
    vpa.sd = sd(VPAnM24)
    )
# A tibble: 2 x 3
  Group  vpa.avg vpa.sd
  <chr>    <dbl>  <dbl>
1 empty     20.8  0.413
2 sample    20.8  0.506

It seems likely that VPA is not getting metabolised to a great extent, but cannot be sure.

4.2 Cells / Negative Mode

# sample prep
vpa.cell.neg.smpl.data <- vpa.cell.neg.noNA %>% 
    filter(Group == "sample")
vpa.cell.neg.data.for.sva <- as.matrix(
  vpa.cell.neg.smpl.data[, which(
    colnames(vpa.cell.neg.smpl.data) == "VPAnC1"
    ):ncol(vpa.cell.neg.smpl.data)]
  )
row.names(vpa.cell.neg.data.for.sva) <- vpa.cell.neg.smpl.data$Samples
vpa.cell.neg.data.for.sva <- t(vpa.cell.neg.data.for.sva)
# pheno prep
vpa.cell.neg.data.pheno <- as.data.frame(vpa.cell.neg.smpl.data[, 5:7])
row.names(vpa.cell.neg.data.pheno) <- vpa.cell.neg.smpl.data$Samples
# sva calculation
vpa.cell.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.neg.data.pheno)
vpa.cell.neg.mod0 <- model.matrix(~ 1, data = vpa.cell.neg.data.pheno)
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "be")
[1] 3
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.cell.neg.sv <- sva(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, vpa.cell.neg.mod0)
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vpa.cell.neg.surr.var <- as.data.frame(vpa.cell.neg.sv$sv)
colnames(vpa.cell.neg.surr.var) <- c("S1", "S2", "S3")


vpa.cell.neg.covar <- vpa.cell.neg.smpl.pca.x %>% 
  select(Samples, PC1:PC4) %>% 
  inner_join(
    cbind(vpa.cell.neg.data.pheno, vpa.cell.neg.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(vpa.cell.other.info, by = "Samples") %>% 
  as_tibble()

vpa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S3) %>% 
  gather("surr_var", "value", S1:S3) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  facet_wrap(~ surr_var)

vpa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S3) %>% 
  gather("surr_var", "value", S1:S3) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ surr_var)

vpa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC4) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

vpa.cell.neg.covar %>% 
  select(PC1:PC4, S1:S3) %>% 
  ggcorr(label = TRUE)

vpa.cell.neg.covar %>% 
  select(PC1:PC4, S1:S3) %>% 
  ggpairs()

vpa.cell.neg.covar %>% 
  ggplot(aes(S3)) +
  geom_histogram(bins = 50)

vpa.cell.neg.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

vpa.cell.neg.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

vpa.cell.neg.covar %>% 
  ggplot(aes(run_order, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.neg.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

vpa.cell.neg.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

vpa.cell.neg.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.neg.covar %>% 
  ggplot(aes(run_order, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.neg.covar %>% 
  ggplot(aes(run_order, prot_conc_ug_uL, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

colnames(vpa.cell.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
# combine the full model matrix and the surrogate variables into one
vpa.cell.neg.d.sv <- cbind(vpa.cell.neg.mod.vpa, vpa.cell.neg.surr.var[, 1:2])  
head(vpa.cell.neg.d.sv)
         cntrl DRUGvsCNTRL          S1          S2
T1_P1_C1     1           0 -0.02572670  0.30588286
T1_P1_C2     1           0  0.22693416 -0.04248658
T1_P1_C3     1           0  0.24683594  0.01195802
T1_P1_V1     1           1  0.26304093  0.05597521
T1_P1_V2     1           1 -0.08024336  0.19758531
T1_P1_V3     1           1 -0.08965598  0.26184963
vpa.cell.neg.top.table <- vpa.cell.neg.data.for.sva %>% 
  # fit a linear model 
  lmFit(vpa.cell.neg.d.sv) %>% 
  # calculate the test statistics
  eBayes() %>% 
  # select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.neg.data.for.sva))
# what the result looks like:
head(vpa.cell.neg.top.table)  
              logFC  AveExpr          t      P.Value    adj.P.Val
VPAnC119 -1.1117931 16.34932 -16.036945 2.285612e-13 4.091246e-11
VPAnC114  0.9324087 15.08073  11.000854 3.019527e-10 5.404953e-08
VPAnC152 -1.3070183 16.93098 -10.520550 6.782678e-10 1.214099e-07
VPAnC148 -0.6677076 14.87538  -7.893929 9.235897e-08 1.653226e-05
VPAnC18  -0.6065030 21.20349  -7.811684 1.092767e-07 1.956052e-05
VPAnC44   0.5297000 22.56006   7.496710 2.098579e-07 3.756456e-05
                 B
VPAnC119 20.769050
VPAnC114 13.612185
VPAnC152 12.796738
VPAnC148  7.826527
VPAnC18   7.656136
VPAnC44   6.995121
# make result more informative
vpa.cell.neg.top.w.info <- vpa.cell.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.neg.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                         compound_full change_in_vpa
1         VPAnC58                        N-Acetylserine          down
2         VPAnC65                              Xanthine          down
3         VPAnC97                          myo-Inositol          down
4        VPAnC105               N-alpha-Acetylglutamine          down
5         VPAnC41                            Asparagine          down
6         VPAnC23                           Thiosulfate          down
7         VPAnC46                           Deoxyribose          down
8        VPAnC135              Glycerol 1-phosphoserine          down
9         VPAnC86                                   G3P          down
10        VPAnC18                         Glyceric Acid          down
11       VPAnC148             Sedoheptulose 7-phosphate          down
12       VPAnC136                 D-Glucose 6-phosphate          down
13       VPAnC172                      Succinoadenosine          down
14       VPAnC144                            Xanthosine          down
15       VPAnC119        Glycerylphosphorylethanolamine          down
16        VPAnC10                           Lactic Acid          down
17       VPAnC176                                  dTDP          down
18       VPAnC159                                   CMP          down
19       VPAnC127                    Ribose 5-Phosphate          down
20       VPAnC152                             GlcNAc-1P          down
21        VPAnC48                               Adenine          down
22       VPAnC160                                   UMP          down
23       VPAnC138                               Inosine          down
24       VPAnC168                                   AMP          down
25       VPAnC169                                   GMP          down
26       VPAnC114                       Homocitric Acid            up
27        VPAnC54                         Valproic acid            up
28       VPAnC200                           ADP-Glucose            up
29        VPAnC53                 O-Phosphoethanolamine            up
30       VPAnC157 11Z,14Z-Eicosadienoic Acid (20:2 n-6)            up
31       VPAnC189                                   CTP            up
32       VPAnC194                                   GTP            up
33        VPAnC67                      3-Sulfinoalanine            up
34       VPAnC193                                   ATP            up
35        VPAnC73                    2-Aminoadipic Acid            up
36        VPAnC44                         Aspartic Acid            up
37       VPAnC124                         Cystathionine            up
38       VPAnC190                                   UTP            up
39        VPAnC17                                Serine            up
40        VPAnC30                             Threonine            up
41         VPAnC2                               Glycine            up
   vpa_div_cntrl
1      0.8006513
2      0.7733022
3      0.7727024
4      0.7642022
5      0.7475017
6      0.7258479
7      0.7200514
8      0.6898581
9      0.6742282
10     0.6567868
11     0.6295061
12     0.6155587
13     0.5263833
14     0.4899408
15     0.4627186
16     0.4458119
17     0.4260245
18     0.4086860
19     0.4066015
20     0.4041553
21     0.3971414
22     0.3471526
23     0.3070139
24     0.1705093
25     0.1357460
26     1.9084597
27     1.8537394
28     1.5939533
29     1.5623231
30     1.5330869
31     1.5075878
32     1.4845507
33     1.4760588
34     1.4597931
35     1.4558662
36     1.4436289
37     1.4029416
38     1.3951522
39     1.2483656
40     1.2386250
41     1.2121890
#write_csv(path = "./results/vpa_cell_neg_top_hits_w_FC_pval.csv", vpa.cell.neg.top.w.info)
vpa.cell.neg.gathered <- vpa.cell.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.cell.neg.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("VPAnC")) %>% 
  gather(key = "Compound", value = "Abundance", VPAnC1:VPAnC99)
# structure so far:
glimpse(vpa.cell.neg.gathered)
Observations: 4,117
Variables: 6
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ S1        <dbl> -0.02572670, 0.22693416, 0.24683594, 0.26304093, -0....
$ S2        <dbl> 0.30588286, -0.04248658, 0.01195802, 0.05597521, 0.1...
$ Compound  <chr> "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "V...
$ Abundance <dbl> 15.08795, 14.67335, 14.81717, 14.55139, 14.79476, 15...
vpa.cell.neg.nested <- vpa.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  # apply a linear model to each individual compound, as a function of the surrogate variables
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  # use broom to tidy up the output
  mutate(augment_model = map(model, augment))
# result to far:
vpa.cell.neg.nested
# A tibble: 179 x 4
   Compound data              model    augment_model     
   <chr>    <list>            <list>   <list>            
 1 VPAnC1   <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 2 VPAnC10  <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 3 VPAnC100 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 4 VPAnC101 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 5 VPAnC102 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 6 VPAnC103 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 7 VPAnC104 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 8 VPAnC105 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
 9 VPAnC106 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
10 VPAnC107 <tibble [23 x 5]> <S3: lm> <tibble [23 x 10]>
# ... with 169 more rows
# now to get the residuals out for each compound
vpa.cell.neg.modSV.resid <- vpa.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  # return to long format
  spread(Compound, .resid) 
glimpse(vpa.cell.neg.modSV.resid[, 1:5])
Observations: 23
Variables: 5
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ VPAnC1    <dbl> 0.09809226, -0.06724331, 0.04638420, -0.24379320, -0...
$ VPAnC10   <dbl> 0.47952620, 0.79800463, 0.31534236, -1.11205044, -0....
$ VPAnC100  <dbl> 0.064964248, -0.108617277, 0.101704046, 0.098637074,...
vpa.cell.neg.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("VPAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
vpa.cell.neg.modSV.pca <- vpa.cell.neg.modSV.resid %>% 
  select(starts_with("VPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(vpa.cell.neg.modSV.pca$sdev^ 2 * 100 / sum(vpa.cell.neg.modSV.pca$sdev ^ 2))

vpa.cell.neg.modSV.pca.x <- as.data.frame(vpa.cell.neg.modSV.pca$x)
row.names(vpa.cell.neg.modSV.pca.x) <- vpa.cell.neg.modSV.resid$Samples
vpa.cell.neg.modSV.pca.x <- vpa.cell.neg.modSV.pca.x %>% 
  bind_cols(vpa.cell.neg.modSV.resid %>% select(Treatment))
vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (54.1% Var)") +
  ylab("PC2 (12.9% Var)")

vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.4% Var)") +
  ylab("PC4 (6.1% Var)")

4.3 Cells / Positive Mode

vpa.cell.pos.smpl.data <- vpa.cell.pos.noNA %>% 
    filter(Group == "sample")
vpa.cell.pos.data.for.sva <- as.matrix(
  vpa.cell.pos.smpl.data[, which(
    colnames(vpa.cell.pos.smpl.data) == "VPApC1"
    ):ncol(vpa.cell.pos.smpl.data)]
  )
row.names(vpa.cell.pos.data.for.sva) <- vpa.cell.pos.smpl.data$Samples
vpa.cell.pos.data.for.sva <- t(vpa.cell.pos.data.for.sva)
vpa.cell.pos.data.pheno <- as.data.frame(vpa.cell.pos.smpl.data[, 5:7])
row.names(vpa.cell.pos.data.pheno) <- vpa.cell.pos.smpl.data$Samples
# sva calculation
vpa.cell.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.pos.data.pheno)
vpa.cell.pos.mod0 <- model.matrix(~ 1, data = vpa.cell.pos.data.pheno)
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "be")
[1] 2
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "leek")
[1] 2
set.seed(2018)
vpa.cell.pos.sv <- sva(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, vpa.cell.pos.mod0)
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
vpa.cell.pos.surr.var <- as.data.frame(vpa.cell.pos.sv$sv)
colnames(vpa.cell.pos.surr.var) <- c("S1", "S2")




vpa.cell.pos.covar <- vpa.cell.pos.smpl.pca.x %>% 
  select(Samples, PC1:PC4) %>% 
  inner_join(
    cbind(vpa.cell.pos.data.pheno, vpa.cell.pos.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(vpa.cell.other.info, by = "Samples") %>% 
  as_tibble()

vpa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S2) %>% 
  gather("surr_var", "value", S1:S2) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  facet_wrap(~ surr_var)

vpa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S2) %>% 
  gather("surr_var", "value", S1:S2) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ surr_var)

vpa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC4) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

vpa.cell.pos.covar %>% 
  select(PC1:PC4, S1:S2) %>% 
  ggcorr(label = TRUE)

vpa.cell.pos.covar %>% 
  select(PC1:PC4, S1:S2) %>% 
  ggpairs()

vpa.cell.pos.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

vpa.cell.pos.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

vpa.cell.pos.covar %>% 
  ggplot(aes(run_order, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

vpa.cell.pos.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

vpa.cell.pos.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  ggplot(aes(run_order, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

colnames(vpa.cell.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.cell.pos.d.sv <- cbind(vpa.cell.pos.mod.vpa, vpa.cell.pos.surr.var)  
head(vpa.cell.pos.d.sv)
         cntrl DRUGvsCNTRL         S1          S2
T1_P1_C1     1           0 -0.3187310  0.19917816
T1_P1_C2     1           0 -0.2279608 -0.06387899
T1_P1_C3     1           0 -0.1409242  0.01021199
T1_P1_V1     1           1 -0.2796968 -0.36213773
T1_P1_V2     1           1 -0.1999223 -0.13213535
T1_P1_V3     1           1 -0.2955297  0.16841744
vpa.cell.pos.top.table <- vpa.cell.pos.data.for.sva %>% 
  lmFit(vpa.cell.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.pos.data.for.sva))
vpa.cell.pos.top.w.info <- vpa.cell.pos.top.table %>%
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.pos.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.pos.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                  compound_full change_in_vpa
1         VPApC35                     Asparagine          down
2         VPApC40             N-Methylnicotinate          down
3         VPApC51                       Xanthine          down
4        VPApC101       Glycerol 1-phosphoserine          down
5         VPApC24                   Nicotinamide          down
6         VPApC49                       D-Ribose          down
7         VPApC89                         GlcNAc          down
8         VPApC67                            G3P          down
9        VPApC112                     Xanthosine          down
10        VPApC38                        Adenine          down
11        VPApC39                   Hypoxanthine          down
12       VPApC119                            CMP          down
13        VPApC14                         Uracil          down
14       VPApC134                            ADP          down
15       VPApC107                        Inosine          down
16        VPApC85 Glycerylphosphorylethanolamine          down
17        VPApC60                         Fucose          down
18        VPApC95                        Uridine          down
19        VPApC74                   D-Galactitol          down
20       VPApC120                            UMP          down
21       VPApC121                            NMN          down
22       VPApC100      Glycerol-3-phosphocholine          down
23        VPApC17                        Proline            up
24        VPApC37                  Aspartic Acid            up
25        VPApC52               3-Sulfinoalanine            up
26       VPApC172                       PC(36:4)            up
27       VPApC159                    ADP-Glucose            up
28        VPApC41          O-Phosphoethanolamine            up
29        VPApC56                Methyl-L-Lysine            up
30        VPApC90                  Cystathionine            up
31       VPApC105                       Thiamine            up
32       VPApC171                       PC(35:2)            up
   vpa_div_cntrl
1      0.8020385
2      0.7628304
3      0.7304178
4      0.6439406
5      0.6390530
6      0.6253119
7      0.6142356
8      0.6080481
9      0.5521731
10     0.4588743
11     0.4538572
12     0.4491053
13     0.4414234
14     0.4391555
15     0.4284642
16     0.4242119
17     0.4078264
18     0.4012096
19     0.3819708
20     0.3793175
21     0.3466167
22     0.1517526
23     1.8933351
24     1.5849782
25     1.5334580
26     1.4855587
27     1.4155297
28     1.3495668
29     1.3212269
30     1.2885595
31     1.2460124
32     1.2131600
#write_csv(path = "./results/vpa_cell_pos_top_hits_w_FC_pval.csv", vpa.cell.pos.top.w.info)
vpa.cell.pos.gathered <- vpa.cell.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.cell.pos.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("VPApC")) %>% 
  gather(key = "Compound", value = "Abundance", VPApC1:VPApC98)
vpa.cell.pos.nested <- vpa.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.cell.pos.modSV.resid <- vpa.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.cell.pos.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.pos.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("VPApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.pos.top.w.info$compound_full,
    k_row = 2, k_col = 2
    )
### PCA - cleaned data ###
vpa.cell.pos.modSV.pca <- vpa.cell.pos.modSV.resid %>% 
  select(starts_with("VPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.cell.pos.modSV.pca.x <- as.data.frame(vpa.cell.pos.modSV.pca$x)
row.names(vpa.cell.pos.modSV.pca.x) <- vpa.cell.pos.modSV.resid$Samples
vpa.cell.pos.modSV.pca.x <- vpa.cell.pos.modSV.pca.x %>% 
  bind_cols(vpa.cell.pos.modSV.resid %>% select(Treatment))
vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (65.4% Var)") +
  ylab("PC2 (12.4% Var)")

vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (6.5% Var)") +
  ylab("PC4 (4.5% Var)")

4.4 Media / Negative Mode

vpa.med.neg.smpl.data <- vpa.med.neg.noNA %>% 
    filter(Group == "sample")
vpa.med.neg.data.for.sva <- as.matrix(
  vpa.med.neg.smpl.data[, which(
    colnames(vpa.med.neg.smpl.data) == "VPAnM1"
    ):ncol(vpa.med.neg.smpl.data)]
  )
row.names(vpa.med.neg.data.for.sva) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.data.for.sva <- t(vpa.med.neg.data.for.sva)
vpa.med.neg.data.pheno <- as.data.frame(vpa.med.neg.smpl.data[, 5:7])
row.names(vpa.med.neg.data.pheno) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.neg.data.pheno)
vpa.med.neg.mod0 <- model.matrix(~ 1, data = vpa.med.neg.data.pheno)
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.neg.sv <- sva(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, vpa.med.neg.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
vpa.med.neg.surr.var <- as.data.frame(vpa.med.neg.sv$sv)
colnames(vpa.med.neg.surr.var) <- c("S1")



vpa.med.neg.covar <- vpa.med.neg.smpl.pca.x %>% 
  select(Samples, PC1:PC4) %>% 
  inner_join(
    cbind(vpa.med.neg.data.pheno, vpa.med.neg.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(vpa.cell.other.info, by = "Samples") %>% 
  as_tibble()

vpa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  ggplot(aes(exp_plate, S1, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1)

vpa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  ggplot(aes(exp_plate, S1)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment))

vpa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC4) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

vpa.med.neg.covar %>% 
  select(PC1:PC4, S1) %>% 
  ggcorr(label = TRUE)

vpa.med.neg.covar %>% 
  select(PC1:PC4, S1) %>% 
  ggpairs()

vpa.med.neg.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

vpa.med.neg.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

vpa.med.neg.covar %>% 
  ggplot(aes(run_order, PC3, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.neg.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

vpa.med.neg.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

vpa.med.neg.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.neg.pca.var <- data.frame(vpa.med.neg.smpl.pca.x$PC3)
colnames(vpa.med.neg.pca.var) <- "PC3"
setequal(row.names(vpa.med.neg.smpl.pca.x), row.names(vpa.med.neg.mod.vpa))
[1] TRUE
head(vpa.med.neg.pca.var)
        PC3
1 2.0980413
2 1.2398362
3 0.9684519
4 1.3396242
5 1.0965746
6 0.8120326
colnames(vpa.med.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.neg.d.sv <- cbind(vpa.med.neg.mod.vpa, vpa.med.neg.pca.var)  
head(vpa.med.neg.d.sv)
         cntrl DRUGvsCNTRL       PC3
T1_P1_C1     1           0 2.0980413
T1_P1_C2     1           0 1.2398362
T1_P1_C3     1           0 0.9684519
T1_P1_V1     1           1 1.3396242
T1_P1_V2     1           1 1.0965746
T1_P1_V3     1           1 0.8120326
vpa.med.neg.top.table <- vpa.med.neg.data.for.sva %>% 
  lmFit(vpa.med.neg.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.neg.data.for.sva))
vpa.med.neg.top.w.info <- vpa.med.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.med.neg.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.med.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
  compound_short compound_full change_in_vpa vpa_div_cntrl
1        VPAnM24 Valproic acid            up      35.71479
vpa.med.neg.gathered <- vpa.med.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vpa.med.neg.pca.var) %>% 
  select(Samples, Treatment, PC3, starts_with("VPAnM")) %>% 
  gather(key = "Compound", value = "Abundance", VPAnM1:VPAnM9)
vpa.med.neg.nested <- vpa.med.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ PC3, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.med.neg.modSV.resid <- vpa.med.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.med.neg.modSV.resid %>% 
  select(Samples:Treatment, one_of(vpa.med.neg.top.w.info$compound_short)) %>% 
  ggplot(aes(Treatment, VPAnM24)) +
  geom_boxplot() +
  geom_jitter(size = 3, alpha = 0.8, aes(color = Treatment))

### PCA - cleaned data ###
vpa.med.neg.modSV.pca <- vpa.med.neg.modSV.resid %>% 
  select(starts_with("VPAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.med.neg.modSV.pca.x <- as.data.frame(vpa.med.neg.modSV.pca$x)
row.names(vpa.med.neg.modSV.pca.x) <- vpa.med.neg.modSV.resid$Samples
vpa.med.neg.modSV.pca.x <- vpa.med.neg.modSV.pca.x %>% 
  bind_cols(vpa.med.neg.modSV.resid %>% select(Treatment))
vpa.med.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (63.4% Var)") +
  ylab("PC2 (30.0% Var)")

4.5 Media / Positive Mode

vpa.med.pos.smpl.data <- vpa.med.pos.noNA %>% 
    filter(Group == "sample")
vpa.med.pos.data.for.sva <- as.matrix(
  vpa.med.pos.smpl.data[, which(
    colnames(vpa.med.pos.smpl.data) == "VPApM1"
    ):ncol(vpa.med.pos.smpl.data)]
  )
row.names(vpa.med.pos.data.for.sva) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.data.for.sva <- t(vpa.med.pos.data.for.sva)
vpa.med.pos.data.pheno <- as.data.frame(vpa.med.pos.smpl.data[, 5:7])
row.names(vpa.med.pos.data.pheno) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.pos.data.pheno)
vpa.med.pos.mod0 <- model.matrix(~ 1, data = vpa.med.pos.data.pheno)
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.pos.sv <- sva(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, vpa.med.pos.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
vpa.med.pos.surr.var <- as.data.frame(vpa.med.pos.sv$sv)
colnames(vpa.med.pos.surr.var) <- c("S1")



vpa.med.pos.covar <- vpa.med.pos.smpl.pca.x %>% 
  select(Samples, PC1:PC4) %>% 
  inner_join(
    cbind(vpa.med.pos.data.pheno, vpa.med.pos.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(vpa.cell.other.info, by = "Samples") %>% 
  as_tibble()

vpa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  ggplot(aes(exp_plate, S1, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1)

vpa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  ggplot(aes(exp_plate, S1)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment))

vpa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC4) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

vpa.med.pos.covar %>% 
  select(PC1:PC4, S1) %>% 
  ggcorr(label = TRUE)

vpa.med.pos.covar %>% 
  select(PC1:PC4, S1) %>% 
  ggpairs()

vpa.med.pos.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

vpa.med.pos.covar %>% 
  select(PC1:PC4, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

vpa.med.pos.covar %>% 
  ggplot(aes(run_order, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.pos.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

vpa.med.pos.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

vpa.med.pos.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 3, alpha = 0.8)

vpa.med.pos.pca.var <- data.frame(vpa.med.pos.smpl.pca.x$PC2)
colnames(vpa.med.pos.pca.var) <- "PC2"
setequal(row.names(vpa.med.pos.smpl.pca.x), row.names(vpa.med.pos.mod.vpa))
[1] TRUE
head(vpa.med.pos.pca.var)
         PC2
1 -1.8760311
2 -1.9221080
3 -1.5321371
4 -1.0724871
5 -0.6100211
6 -1.1385369
colnames(vpa.med.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.pos.d.sv <- cbind(vpa.med.pos.mod.vpa, vpa.med.pos.pca.var)  
head(vpa.med.pos.d.sv)
         cntrl DRUGvsCNTRL        PC2
T1_P1_C1     1           0 -1.8760311
T1_P1_C2     1           0 -1.9221080
T1_P1_C3     1           0 -1.5321371
T1_P1_V1     1           1 -1.0724871
T1_P1_V2     1           1 -0.6100211
T1_P1_V3     1           1 -1.1385369
vpa.med.pos.top.table <- vpa.med.pos.data.for.sva %>% 
  lmFit(vpa.med.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.pos.data.for.sva))
vpa.med.pos.top.table
data frame with 0 columns and 0 rows

5 Pathway analysis

5.1 KEGG attach

head(vpa.cell.neg.top.w.info)
  compound_short      logFC  AveExpr         t      P.Value    adj.P.Val
1        VPAnC58 -0.3207540 17.48468 -4.615315 1.445584e-04 0.0258759505
2        VPAnC65 -0.3708957 20.56905 -4.826636 8.713540e-05 0.0155972362
3        VPAnC97 -0.3720153 19.32692 -7.495251 2.104994e-07 0.0000376794
4       VPAnC105 -0.3879737 15.24568 -4.509934 1.861965e-04 0.0333291767
5        VPAnC41 -0.4198513 18.18660 -6.550547 1.613484e-06 0.0002888137
6        VPAnC23 -0.4622609 18.27583 -6.245694 3.190768e-06 0.0005711475
          B vpa_div_cntrl change_in_vpa           compound_full
1 0.4099156     0.8006513          down          N-Acetylserine
2 0.9152870     0.7733022          down                Xanthine
3 6.9920292     0.7727024          down            myo-Inositol
4 0.1577535     0.7642022          down N-alpha-Acetylglutamine
5 4.9304963     0.7475017          down              Asparagine
6 4.2413631     0.7258479          down             Thiosulfate
       formula    mass   rt     cas_id
1   C5 H9 N O4 147.054 3.38 16354-58-8
2  C5 H4 N4 O2 152.034 1.92    69-89-6
3    C6 H12 O6 180.064 5.02    87-89-8
4 C7 H12 N2 O4 188.079 5.83  2490-97-3
5  C4 H8 N2 O3 132.054 8.37    70-47-3
6     H2 O3 S2 113.945 4.85 14383-50-7
head(vpa.cell.pos.top.w.info)
  compound_short      logFC  AveExpr         t      P.Value   adj.P.Val
1        VPApC35 -0.3182566 18.67222 -4.853699 7.687773e-05 0.010916637
2        VPApC40 -0.3905657 15.41180 -4.621143 1.350775e-04 0.019181004
3        VPApC51 -0.4532062 16.91997 -4.489349 1.860706e-04 0.026422023
4       VPApC101 -0.6350004 14.04687 -5.753222 8.991813e-06 0.001276837
5        VPApC24 -0.6459926 19.38535 -4.275739 3.128901e-04 0.044430397
6        VPApC49 -0.6773521 14.61980 -4.311187 2.870291e-04 0.040758128
            B vpa_div_cntrl change_in_vpa            compound_full
1  0.58407812     0.8020385          down               Asparagine
2  0.01844396     0.7628304          down       N-Methylnicotinate
3 -0.30211613     0.7304178          down                 Xanthine
4  2.75153845     0.6439406          down Glycerol 1-phosphoserine
5 -0.82081936     0.6390530          down             Nicotinamide
6 -0.73485577     0.6253119          down                 D-Ribose
        formula    mass    rt     cas_id
1   C4 H8 N2 O3 132.054  8.39    70-47-3
2    C7 H7 N O2 137.048 10.39   535-83-1
3   C5 H4 N4 O2 152.034  1.92    69-89-6
4 C6 H14 N O8 P 259.045  8.00 26289-09-8
5    C6 H6 N2 O 122.048  1.90    98-92-0
6     C5 H10 O5 150.052  2.45    50-69-1
head(vpa.med.neg.top.w.info)
  compound_short   logFC  AveExpr        t      P.Value    adj.P.Val
1        VPAnM24 5.15845 18.25088 29.61713 1.105853e-22 4.533997e-21
         B vpa_div_cntrl change_in_vpa compound_full   formula    mass  rt
1 41.71555      35.71479            up Valproic acid C8 H16 O2 144.115 1.3
   cas_id
1 99-66-1
vpa.full.hit.list <- vpa.cell.neg.top.w.info %>% 
  mutate(Mode = "cell.neg") %>% 
  bind_rows(
    vpa.cell.pos.top.w.info %>% 
      mutate(Mode = "cell.pos")
  ) %>% 
  bind_rows(
    vpa.med.neg.top.w.info %>% 
      mutate(Mode = "med.neg")
  ) %>% 
  as.tibble()
vpa.sml.hit.list <- vpa.full.hit.list %>% 
  select(compound_full:formula, cas_id) %>% 
  distinct() %>% 
  arrange(compound_full) %>% 
  left_join(cmpnd.id.db, by = "cas_id")
all.equal(vpa.sml.hit.list$compound_full.x, vpa.sml.hit.list$compound_full.y)
[1] TRUE
#write_csv(path = "./results/vpa_only_exp_hits_w_kegg_sv_mod.csv", vpa.sml.hit.list)
glimpse(vpa.sml.hit.list)
Observations: 58
Variables: 7
$ compound_full.x <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ formula         <chr> "C20 H36 O2", "C6 H11 N O4", "C3 H7 N O4 S", "...
$ cas_id          <chr> "2091-39-6", "542-32-5", "1115-65-7", "73-24-5...
$ compound_full.y <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ HMDB            <chr> "HMDB05060", "HMDB00510", "HMDB00996", "HMDB00...
$ Metlin          <chr> "62964", "3271", "5927", "85", "34522", "63203...
$ KEGG            <chr> "C16525", "C00956", "C00606", "C00147", "C0000...

5.2 Nucleotide metabolism plots

nucleotide.metab <- read_csv("./data/pathways/vpa_only_exp_nucleotide_hits.csv")
### Purine Metabolism ###
purine.triphosphates <- c("ATP", "GTP", "dATP", "dGTP")
purine.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "Purine" | path2 == "Purine" | path3 == "Purine") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x) %>% 
  bind_rows(
    vpa.cell.neg.compound.info %>% 
      filter(compound_full %in% purine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  bind_rows(
    vpa.cell.pos.compound.info %>% 
      filter(compound_full %in% purine.triphosphates) %>% 
      select(compound_full, compound_short)
  )
#write_csv(path = "./data/pathways/purine_pathway.csv", purine.for.plot)  do not run
purine.plot.order <- read_csv("./data/pathways/purine_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
purine.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(purine.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
purine.sig.plot <- purine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(
    purine.plot.order %>% 
      filter(compound_full != "Aspartic Acid" & compound_full != "Ribose 5-Phosphate"), 
    by = "compound_short"
    ) %>%
  filter(Sig == "Y") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
 # ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
purine.sig.plot

purine.not.sig.plot <- purine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(purine.plot.order, by = "compound_short") %>%
  filter(Sig == "N") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
purine.not.sig.plot

### Pyrimidine Metabolism ###
pyrimidine.triphosphates <- c("UTP", "CTP", "dTTP", "dCTP")
pyrimidine.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "Pyrimidine" | path2 == "Pyrimidine" | path3 == "Pyrimidine") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x) %>% 
  bind_rows(
    vpa.cell.neg.compound.info %>% 
      filter(compound_full %in% pyrimidine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  bind_rows(
    vpa.cell.pos.compound.info %>% 
      filter(compound_full %in% pyrimidine.triphosphates) %>% 
      select(compound_full, compound_short)
  ) %>% 
  distinct()
#write_csv(path = "./data/pathways/pyrimidine_pathway.csv", pyrimidine.for.plot)
pyrimidine.plot.order <- read_csv("./data/pathways/pyrimidine_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
pyrimidine.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(pyrimidine.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
pyrimidine.sig.plot <- pyrimidine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(
    pyrimidine.plot.order %>% 
      filter(compound_full != "Ribose 5-Phosphate"), by = "compound_short") %>%
  filter(Sig == "Y") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
pyrimidine.sig.plot

pyrimidine.not.sig.plot <- pyrimidine.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(pyrimidine.plot.order, by = "compound_short") %>%
  filter(Sig == "N") %>% 
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.5, 2.5)
pyrimidine.not.sig.plot

### Pentose Phosphate Pathway ###
ppp.for.plot <- nucleotide.metab %>% 
  inner_join(vpa.full.hit.list, by = "cas_id") %>% 
  filter(path1 == "PPP" | path2 == "PPP" | path3 == "PPP") %>% 
  select(compound_full.x, compound_short) %>% 
  rename(compound_full = compound_full.x)
#write_csv(path = "./data/pathways/ppp_pathway.csv", ppp.for.plot)
ppp.plot.order <- read_csv("./data/pathways/ppp_pathway.csv") %>% 
  mutate(plot_order = factor(Name, levels = Name))
ppp.data <- vpa.cell.neg.modSV.resid %>% 
  inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  select(Samples:Treatment, one_of(ppp.for.plot$compound_short)) %>% 
  mutate_at(vars(matches("VPA")), scale)
ppp.sig.plot <- ppp.data %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(ppp.plot.order, by = "compound_short") %>%
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90"),
    legend.justification = "top"
    ) +
  xlab("") +
  ylab("") +
  #ggtitle("Normalized Abundance by Treatment Group\nppp Metabolism\nStatistically Significant Compounds") +
  scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
  ylim(-2.75, 2.5) +
  scale_x_discrete(labels = c("Glucose\n6-Phosphate", "Ribose", "Deoxyribose", "Glyceraldehyde\n3-Phosphate (Neg)", "Glyceraldehyde\n3-Phosphate (Pos)", "Ribose\n5-Phosphate", "Sedoheptulose\n7-Phosphate"))
ppp.sig.plot

### all together
nuc.legend <- get_legend(ppp.sig.plot)
vpa.only.nuc.sig.plot.grid <- plot_grid(
  purine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(6,6,0,0), "pt")), 
  pyrimidine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")), 
  plot_grid(
    ppp.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
    nuc.legend,
    rel_widths = c(1.45, 0.55)
    ),
  ncol = 1, labels = c("A", "B", "C"),
  rel_heights = c(1, 1, 1)
  )
vpa.only.nuc.sig.plot.grid

#save_plot("./results/vpa_only_exp_nuc_sig.png", vpa.only.nuc.sig.plot.grid, base_height = 10, base_width = 8)
nuc.not.sig.row.plots <- plot_grid(
  purine.not.sig.plot, 
  pyrimidine.not.sig.plot, 
  labels = c("A", "B"), 
  ncol = 1
  )
nuc.not.sig.row.plots

#save_plot("./results/vpa_only_exp_nuc_not_sig.png", nuc.not.sig.row.plots, base_width = 8, base_height = 8)

6 Session Info

sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2.2      ggridges_0.5.1      broom_0.5.0        
 [4] limma_3.36.5        sva_3.28.0          BiocParallel_1.14.2
 [7] genefilter_1.62.0   mgcv_1.8-25         nlme_3.1-137       
[10] heatmaply_0.15.2    viridis_0.5.1       viridisLite_0.3.0  
[13] plotly_4.8.0        GGally_1.4.0        cowplot_0.9.3      
[16] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
[19] purrr_0.2.5         readr_1.1.1         tidyr_0.8.2        
[22] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    

loaded via a namespace (and not attached):
  [1] colorspace_1.3-2     class_7.3-14         modeltools_0.2-22   
  [4] mclust_5.4.1         rprojroot_1.3-2      rstudioapi_0.8      
  [7] flexmix_2.3-14       bit64_0.9-7          fansi_0.4.0         
 [10] AnnotationDbi_1.42.1 mvtnorm_1.0-8        lubridate_1.7.4     
 [13] xml2_1.2.0           splines_3.5.1        codetools_0.2-15    
 [16] robustbase_0.93-3    knitr_1.20           jsonlite_1.5        
 [19] annotate_1.58.0      cluster_2.0.7-1      kernlab_0.9-27      
 [22] shiny_1.2.0          compiler_3.5.1       httr_1.3.1          
 [25] backports_1.1.2      assertthat_0.2.0     Matrix_1.2-15       
 [28] lazyeval_0.2.1       cli_1.0.1            later_0.7.5         
 [31] htmltools_0.3.6      tools_3.5.1          gtable_0.2.0        
 [34] glue_1.3.0           reshape2_1.4.3       Rcpp_1.0.0          
 [37] Biobase_2.40.0       cellranger_1.1.0     trimcluster_0.1-2.1 
 [40] gdata_2.18.0         crosstalk_1.0.0      iterators_1.0.10    
 [43] fpc_2.1-11.1         rvest_0.3.2          mime_0.6            
 [46] gtools_3.8.1         XML_3.98-1.16        dendextend_1.9.0    
 [49] DEoptimR_1.0-8       MASS_7.3-51.1        scales_1.0.0        
 [52] TSP_1.1-6            promises_1.0.1       hms_0.4.2           
 [55] parallel_3.5.1       RColorBrewer_1.1-2   yaml_2.2.0          
 [58] memoise_1.1.0        gridExtra_2.3        reshape_0.8.8       
 [61] stringi_1.2.4        RSQLite_2.1.1        gclus_1.3.1         
 [64] S4Vectors_0.18.3     foreach_1.4.4        seriation_1.2-3     
 [67] caTools_1.17.1.1     BiocGenerics_0.26.0  matrixStats_0.54.0  
 [70] rlang_0.3.0.1        pkgconfig_2.0.2      prabclus_2.2-6      
 [73] bitops_1.0-6         evaluate_0.12        lattice_0.20-38     
 [76] bindr_0.1.1          labeling_0.3         htmlwidgets_1.3     
 [79] bit_1.1-14           tidyselect_0.2.5     plyr_1.8.4          
 [82] magrittr_1.5         R6_2.3.0             IRanges_2.14.12     
 [85] gplots_3.0.1         DBI_1.0.0            pillar_1.3.0        
 [88] haven_1.1.2          whisker_0.3-2        withr_2.1.2         
 [91] survival_2.43-1      RCurl_1.95-4.11      nnet_7.3-12         
 [94] modelr_0.1.2         crayon_1.3.4         utf8_1.1.4          
 [97] KernSmooth_2.23-15   rmarkdown_1.10       grid_3.5.1          
[100] readxl_1.1.0         data.table_1.11.8    blob_1.1.1          
[103] digest_0.6.18        diptest_0.75-7       webshot_0.5.1       
[106] xtable_1.8-3         httpuv_1.4.5         stats4_3.5.1        
[109] munsell_0.5.0        registry_0.5